package org.biogroovy.io.local

import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.PDDocumentInformation
import org.biogroovy.models.Article
import org.biogroovy.models.Author
import org.biogroovy.models.MeshHeading
import org.biogroovy.models.Journal

/**
 * This class reads PDF files and extracts metadata, and injects PubMed 
 * metadata into PDF files. This turns journal articles into a resource
 * that can be mined.
 */
class PDFArticleSerializer {

    /** The key used to store/retrieve abstract information. */
    static final String ABSTRACT = "Abstract"

    /** The key used to store/retrieve the title of the journal. */
    static final String JOURNAL_TITLE = "JournalTitle"

    /** The key used to store/retrieve the ISSN of the journal. */
    static final String JOURNAL_ISSN = "JournalISSN"

    /** The key used to store/retrieve the volume of the journal. */
    static final String JOURNAL_VOLUME = "JournalVolume"

    /** The key used to store/retrieve the issue of the journal. */
    static final String JOURNAL_ISSUE = "JournalIssue"

    /** The key used to store/retrieve the publication date of the journal. */
    static final String JOURNAL_PUBLICATION_DATE = "JournalPublicationDate"

    /** The key used to store/retrieve the PubMed ID. */
    static final String PUBMED_ID = "PMID"

    /** The key used to store/retrieve the list of authors. */
    static final String AUTHORS = "Authors"

    /** Separator between entries of a serialized list (keywords, authors). */
    private static final String LIST_SEPARATOR = ";"

    /** Separator between the last name and the first name of a serialized author. */
    private static final String NAME_SEPARATOR = ","

    /**
     * This method reads the metadata of a PDF file from a stream.
     * The PDF document is closed before returning; the stream itself is
     * left for the caller to close.
     * @param stream the input stream of the PDF file.
     * @return the metadata for the article.
     */
    public Article readMetadata(InputStream stream){

        Article article = new Article()

        PDDocument document = PDDocument.load(stream)
        try {
            PDDocumentInformation info = document.getDocumentInformation()

            article.title = info.getTitle()
            article.dateCreated = info.getCreationDate()?.getTime()
            article.dateRevised = info.getModificationDate()?.getTime()
            article.pubmedId = info.getCustomMetadataValue(PUBMED_ID)

            // keywords are stored as a ';'-separated list of MeSH headings
            String keywords = info.getKeywords()
            if (keywords != null){
                for (String keyword : keywords.split(LIST_SEPARATOR)){
                    // skip empty tokens so an empty keyword string yields no headings
                    if (!keyword.trim().isEmpty()){
                        article.meshHeadings.add(new MeshHeading(keyword))
                    }
                }
            }

            article.abs = info.getCustomMetadataValue(ABSTRACT)

            // NOTE(review): assumes Article initialises its journal -- confirm
            article.journal.title = info.getCustomMetadataValue(JOURNAL_TITLE)
            article.journal.issn = info.getCustomMetadataValue(JOURNAL_ISSN)
            article.journal.volume = info.getCustomMetadataValue(JOURNAL_VOLUME)
            article.journal.issue = info.getCustomMetadataValue(JOURNAL_ISSUE)

            // the date is stored as Date.toString() output (see updatePDF)
            String date = info.getCustomMetadataValue(JOURNAL_PUBLICATION_DATE)
            if (date != null){
                article.journal.publicationDate = new Date(date)
            }

            // authors are stored as "lastname,firstname" entries separated by ';'
            String authors = info.getCustomMetadataValue(AUTHORS)
            if (authors != null){
                for (String entry : authors.split(LIST_SEPARATOR)){
                    if (entry.trim().isEmpty()){
                        continue
                    }
                    String[] nameParts = entry.split(NAME_SEPARATOR)
                    if (nameParts.length == 0){
                        // the entry consisted only of separators
                        continue
                    }
                    // an entry without a comma carries only a last name
                    String firstname = nameParts.length > 1 ? nameParts[1].trim() : null
                    article.authors.add(new Author(firstname: firstname, lastname: nameParts[0].trim()))
                }
            }
        } finally {
            document.close()
        }

        return article
    }

    /**
     * This method reads the metadata from a file and returns an article.
     * @param file the location of the PDF file to be read, given as a URI string.
     * @return the article metadata.
     * @throws IOException if there is a problem reading the file.
     */
    public Article readMetadata(String file) throws IOException{
        InputStream stream = new URI(file).toURL().openStream()
        try {
            return readMetadata(stream)
        } finally {
            // this method opened the stream, so it is responsible for closing it
            stream.close()
        }
    }

    /**
     * A convenience method for reading the metadata from a file.
     * @param file the PDF file to be read.
     * @return the article metadata.
     * @throws IOException if there is a problem reading the file.
     */
    public Article readMetadata(File file) throws IOException{
        InputStream stream = new FileInputStream(file)
        try {
            return readMetadata(stream)
        } finally {
            // this method opened the stream, so it is responsible for closing it
            stream.close()
        }
    }

    /**
     * This method updates a PDF file with PubMed metadata.
     * The document is loaded from and saved back to the same path.
     * @param file the PDF file to be updated.
     * @param article the article metadata
     * @throws IOException if there is a problem updating the PDF file.
     */
    public void writeMetadata(String file, Article article) throws IOException{

        PDDocument document = PDDocument.load(file)
        try {
            PDDocumentInformation info = document.getDocumentInformation()

            updatePDF(info, article)
            document.setDocumentInformation(info)

            document.save(file)
        } finally {
            document.close()
        }

    }

    /**
     * This method is responsible for updating the PDF Document metadata.
     * Keywords and authors are written in the format that
     * {@link #readMetadata(InputStream)} parses.
     * @param info the PDF document metadata.
     * @param article the article (PubMed) metadata.
     */
    private void updatePDF(PDDocumentInformation info, Article article){
        info.title = article.title

        if (article.dateCreated != null){
            // Calendar.setTime returns void, so it cannot be passed inline
            Calendar creationDate = Calendar.getInstance()
            creationDate.setTime(article.dateCreated)
            info.setCreationDate(creationDate)
        }

        if (article.dateRevised != null){
            Calendar modDate = Calendar.getInstance()
            modDate.setTime(article.dateRevised)
            info.setModificationDate(modDate)
        }

        info.setCustomMetadataValue(ABSTRACT, article.abs)
        info.setCustomMetadataValue(JOURNAL_TITLE, article.journal?.title)
        info.setCustomMetadataValue(JOURNAL_ISSN, article.journal?.issn)
        info.setCustomMetadataValue(JOURNAL_VOLUME, article.journal?.volume)
        info.setCustomMetadataValue(JOURNAL_ISSUE, article.journal?.issue)
        // null-safe: an article without a publication date must not abort the update
        info.setCustomMetadataValue(JOURNAL_PUBLICATION_DATE, article.journal?.publicationDate?.toString())
        info.setCustomMetadataValue(PUBMED_ID, article.pubmedId)

        // write authors as "lastname,firstname" entries separated by ';'
        String authors = article.authors?.collect { author ->
            (author.lastname ?: "") + NAME_SEPARATOR + (author.firstname ?: "")
        }?.join(LIST_SEPARATOR)
        info.setCustomMetadataValue(AUTHORS, authors)

        // write keywords as a ';'-separated list
        List<String> keywords = []
        for (MeshHeading meshHeading : article.meshHeadings){
            if (meshHeading.descriptorName != null){
                keywords.add(meshHeading.toString())
            }
        }
        info.setKeywords(keywords.join(LIST_SEPARATOR))

    }

    /**
     * This method is responsible for updating the PDF file with metadata from the Article object.
     * Both streams are closed when the method returns, whether or not the update succeeded.
     * @param inputFile the PDF file to be updated.
     * @param outFile the output file.
     * @param article the article metadata.
     */
    public void writeMetadata(InputStream inputFile, OutputStream outFile, Article article){
        try {
            PDDocument document = PDDocument.load(inputFile)
            try {
                PDDocumentInformation info = document.getDocumentInformation()

                updatePDF(info, article)
                document.setDocumentInformation(info)

                document.save(outFile)
            } finally {
                document.close()
            }
        } finally {
            // close the output even if closing the input fails
            try {
                inputFile.close()
            } finally {
                outFile.close()
            }
        }
    }


}
